rm(list = ls())
setwd("/Users/John/Dropbox/")

# --- Load Required Packages ---
library(readxl)
library(readr)
library(stringr)
library(tidyverse)

# --- Create ISIC-Year Grid ---

isic <- read_csv("JOP_Replication_Materials/data/raw/isic_list.csv")
isic <- isic[,-1]

isic <- as.data.frame(expand_grid(x = c(1995:2015), y = unique(isic$isic)))

colnames(isic) <- c("year", "isic")

# --- Load Strategic Industry Data ---

# R&D based
rnd <- read_excel("JOP_Replication_Materials/data/raw/strategic_isic.xlsx", sheet = "cumulative")

rnd <- pivot_longer(rnd, cols = c(9:21), names_to = "blah", values_to = "isic")

rnd <- as.data.frame(unique(na.omit(rnd$isic)))

colnames(rnd) <- "isic"
rnd$isic <- str_pad(rnd$isic, 4, pad = "0")
rnd$rnd <- 1

# Infrastructure
inf <- read_excel("JOP_Replication_Materials/data/raw/strategic_isic.xlsx", sheet = "infrastructure")

inf <- dplyr::select(inf, isic)

inf$inf <- 1

inf$isic <- as.character(inf$isic)
inf <- inf[!inf$isic %in% rnd$isic,]

# Dependency
dep <- read_excel("JOP_Replication_Materials/data/raw/strategic_isic.xlsx", sheet = "dependency")

dep <- dplyr::select(dep, isic)
dep$isic <- as.character(dep$isic)
dep$dep <- 1
dep$isic <- str_pad(dep$isic, 4, pad = "0")

# Combine all
colnames(rnd)[2] <- "strategic"
colnames(inf)[2] <- "strategic"
colnames(dep)[2] <- "strategic"

strategic <- as.data.frame(rbind(rnd, inf, dep))

# --- Load DV Data ---

# Load data
gp <- read_excel("JOP_Replication_Materials/data/raw/dv_clean.xlsx", sheet = "government_procurement") %>%
  pivot_longer(cols = 7:11, values_to = "isic")

# Combine DV

combined <- as.data.frame(gp)

combined <- filter(combined, !is.na(isic))

combined <- combined %>% 
  group_by(isic, year) %>%
  mutate(isic_year = length(unique(desc_chinese))) 

dv <- dplyr::select(combined, year, isic, isic_year)

dv <- distinct(dv, year, isic, .keep_all = TRUE)

dv$isic <- str_pad(dv$isic, 4, pad = "0")

# --- Load Covariate Data ---
hhi <- read_csv("JOP_Replication_Materials/data/processed/hhi_final.csv") 
output <- read_csv("JOP_Replication_Materials/data/processed/soe_share.csv") 

# --- Load CCD Processing Share Data ---
ccd_isic <- read_csv("JOP_Replication_Materials/data/processed/ccd_isic_processing.csv")

# --- Merge All Data ---

df <- isic %>%
  left_join(dv, by = c("year", "isic")) %>%
  left_join(strategic, by = "isic", relationship = "many-to-many") %>%
  left_join(ccd_isic, by = c("isic", "year")) %>%
  left_join(hhi, by = c("isic", "year")) %>%
  left_join(output, by = c("isic", "year"))

# --- Create Derived Variables ---
df <- df %>%
  mutate(
    isic_year = replace_na(isic_year, 0),
    strategic = replace_na(strategic, 0),
    isic2 = substr(isic, 1, 2),
    sqrtta = sqrt(isic_year),
    is_post = ifelse(year >= 2006, 1, 0)
  ) %>%
  group_by(isic) %>%
  mutate(median_share = median(processing_share, na.rm = TRUE)) %>%
  group_by(isic2) %>%
  mutate(
    med_hhi_isic2 = median(median_hhi_isic, na.rm = TRUE),
    med_soe_isic2 = median(median_soeshare_isic, na.rm = TRUE)
  ) %>%
  ungroup() %>%
  distinct(isic, year, .keep_all = TRUE) %>%
  dplyr::select(year, isic2, isic, isic_year, sqrtta, strategic, median_share,
                is_post, med_hhi_isic2, med_soe_isic2)

# --- Save Final Dataset ---
write_csv(df, "JOP_Replication_Materials/appendix/data/final_dataset_gp_only.csv")